In [26]:
import re
import string
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import sklearn as skl
import matplotlib.pyplot as plt
%matplotlib inline
In [43]:
import matplotlib as mpl
# Global matplotlib font settings applied to every figure drawn afterwards.
# 'family' must be a real font name or one of matplotlib's generic families
# (serif, sans-serif, cursive, fantasy, monospace); the previous value
# 'normal' is neither, so it only triggered "font family not found" warnings
# before falling back to the default font.
font = {
    'family': 'sans-serif',
    'weight': 'normal',
    'size': 22,
}
mpl.rc('font', **font)
In [2]:
import nltk
# nltk.download("stopwords")
In [3]:
def removePunctuation(x):
    """Lowercase ``x`` and blank out non-ASCII characters and punctuation.

    Every non-ASCII character and every character of ``string.punctuation``
    is replaced by a single space (not deleted), so the length of the
    lowercased text is preserved and words never get glued together.

    Parameters
    ----------
    x : str
        Text to normalise.

    Returns
    -------
    str
        The lowercased text with the offending characters replaced by spaces.
    """
    # Lowercasing all words
    x = x.lower()
    # Removing non ASCII chars
    x = re.sub(r'[^\x00-\x7f]', r' ', x)
    # Removing (replacing with empty spaces actually) all the punctuations.
    # string.punctuation must be escaped before being placed inside [...]:
    # unescaped, its "\]" pair is parsed as an escaped "]" and the backslash
    # itself is silently left out of the character class.
    return re.sub("[" + re.escape(string.punctuation) + "]", " ", x)
In [4]:
# NLTK's English stopword list, held as a set for fast membership tests
stops = set(stopwords.words("english"))


def removeStopwords(x):
    """Return ``x`` without the whitespace-separated tokens found in ``stops``.

    The text is split on whitespace, tokens that are members of ``stops`` are
    dropped, and the survivors are re-joined with single spaces. Matching is
    exact (case-sensitive) against the entries of ``stops``.
    """
    kept_tokens = []
    for token in x.split():
        if token in stops:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)
In [2]:
# One raw DataFrame per topic, keyed by topic name and read from ../data
dataframe_raw = {
    topic_name: pd.read_csv(csv_path)
    for topic_name, csv_path in (
        ("cooking", "../data/cooking.csv"),
        ("crypto", "../data/crypto.csv"),
        ("robotics", "../data/robotics.csv"),
        ("biology", "../data/biology.csv"),
        ("travel", "../data/travel.csv"),
        ("diy", "../data/diy.csv"),
    )
}
In [4]:
def printExample(df):
    """Print one randomly sampled row of ``df``.

    Output is the row's ``title``, then its ``tags`` when ``df`` has a
    ``tags`` column, then its ``content``.
    """
    picked = df.sample(1)

    def field(column):
        # `picked` holds exactly one row: take that row's value as text
        return str(picked[column].values[0])

    print("Title: " + field('title'))
    if 'tags' in df.columns:
        print("Tags: " + field('tags'))
    print(field('content'))
# Print one randomly sampled row of the raw biology frame
printExample(dataframe_raw['biology'])
In [5]:
# Load ../data/test.csv and print one randomly sampled row of it.
# NOTE(review): presumably this is the unlabeled test set (no 'tags' column) — confirm.
test = pd.read_csv('../data/test.csv')
printExample(test)
In [11]:
# One DataFrame per topic, keyed by topic name, read from the "*_light.csv" files in ../data
dataframe = {
    topic_name: pd.read_csv(csv_path)
    for topic_name, csv_path in (
        ("cooking", "../data/cooking_light.csv"),
        ("crypto", "../data/crypto_light.csv"),
        ("robotics", "../data/robotics_light.csv"),
        ("biology", "../data/biology_light.csv"),
        ("travel", "../data/travel_light.csv"),
        ("diy", "../data/diy_light.csv"),
    )
}
In [8]:
# Peek at three random diy rows, restricted to the title / content / tags columns
dataframe['diy'].sample(3).loc[:, ["title", "content", "tags"]]
Out[8]:
In [9]:
# (rows, columns) of the frame loaded from ../data/test.csv
test.shape
Out[9]:
In [12]:
# Register the test frame under the 'physics' key so it is counted alongside
# the other topics. NOTE(review): presumably test.csv is the physics set — confirm.
dataframe['physics'] = test
In [22]:
# List the topic names currently held in `dataframe`
dataframe.keys()
Out[22]:
In [63]:
# Per-topic dataset statistics, collected into parallel lists:
#   topics[i]    - topic name
#   nExamples[i] - number of rows in that topic's frame
#   nTags[i]     - total number of tags over all rows (0 for 'physics')
# tagsHist gathers the per-row tag counts of every topic except 'physics'.
# The lists are re-created here so re-running the cell does not double-count.
topics = list()
nExamples = list()
nTags = list()
tagsHist = list()
for topic in dataframe.keys():
    topics.append(topic)
    nExamples.append(dataframe[topic].shape[0])
    if topic == 'physics':
        # 'physics' is reported without tag statistics.
        # NOTE(review): presumably because the test frame has no 'tags' column — confirm.
        nTags.append(0)
        print(topic + ": " + str(nExamples[-1]))
    else:
        # Tags per row = number of pieces of the 'tags' string separated by "', '".
        # Only that one column is needed, so iterate it directly instead of
        # building a full row Series per example with iterrows().
        tagCountList = [len(tags.split('\', \'')) for tags in dataframe[topic]['tags']]
        nTags.append(np.sum(tagCountList))
        tagsHist.extend(tagCountList)
        print(topic + ": " + str(nExamples[-1]) + " tags: " + str(nTags[-1]))
In [46]:
# Bar chart: number of examples per topic, one bar per entry of `topics`
x = np.arange(len(nExamples))
fig, ax = plt.subplots(figsize=(14, 6))
ax.bar(x, nExamples, align='center')
ax.set_xticks(x)
ax.set_xticklabels(topics)
ax.set_ylabel("Number of Examples")
Out[46]:
In [84]:
# Histogram of tags per example over all topics collected in `tagsHist`.
# Bin edges are consecutive integers so each bar covers exactly one tag count.
# The last histogram bin is closed on both sides, so with fixed edges 1..6 any
# example with 6 tags would be merged into the "5" bar; the upper edge is
# therefore derived from the data (and never below the original 1..6 range).
maxTags = max([5] + list(tagsHist))
bins = np.arange(1, maxTags + 2)
plt.figure(figsize=(14,6))
# align='left' centres each bar on its left edge, i.e. on the tag count itself
plt.hist(tagsHist, align='left', bins=bins)
plt.xticks(bins)
plt.xlabel("Tags per example")
Out[84]:
In [60]:
# Tags per crypto example (pieces of the 'tags' string separated by "', '"), largest first
dataframe['crypto']['tags'].map(lambda tagString: len(tagString.split("', '"))).sort_values(ascending=False)
Out[60]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: